library(tidyverse)
library(janitor)
library(knitr)
library(tidyr)
library(readxl)
library(lubridate)
library(readr)
library(stringr)
library(ggplot2)
library(tsibble)
library(dbplyr)
library(stats)
library(plotly)
library(ggridges)
library(forecast)
library(patchwork)
library(dplyr)
library(flexdashboard)
library(leaflet)
library(shiny)

library(leaflet.providers)

knitr::opts_chunk$set(
    echo = TRUE,
    warning = FALSE,
    fig.width = 8, 
  fig.height = 7,
  out.width = "90%"
)


theme_set(theme_minimal() + theme(legend.position = "bottom"))

options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

scale_colour_discrete = scale_color_viridis_d
scale_fill_discrete = scale_fill_viridis_d

Describing the motivation for our project

We like many New Yorkers see rats as a major problem that has only worsened following the pandemic. New York City government agrees and has implemented and promoted a flashy new initiative they are calling “Send Rats Packing.” https://www.nyc.gov/assets/queenscb2/downloads/pdf/notices/2023/SetoutTimes-Residential-Flyer-2023-02.pdf This initiative is mainly composed of a new rule involving trash that aims to reduce the time that trash, recycling, and curbside composting will sit on the sidewalk. The new rule went into effect on April 1, 2023 and left Residential buildings and their managers with two options -Place waste out after 6:00 PM in a container of 55 gallons or less with a secure lid or Place waste out after 8:00 PM, if putting bags directly on the curb. Although everyone wants to see rats gone not everyone is on board with this new rule and many question if it is actually going to result in less rats. The NYC Building Supers an organization composed of building maintenance workers like porters and supers across the 5 boroughs has called this rule “outrageous and unfair” which requires them to work 14 hour day just to comply with the new rule. They have banded together to strike this rule by engaging in city hall protests and acts of civil disobedience by not complying with the new trash time. https://nycbuildingsupers.com/ Given this backdrop and the general skepticism of the effectiveness of the measure we were motivated to explore whether or not this trash time is effective at reducing the presence of rats across the 5 boroughs. ## Cleaning rat sightings dataset

rat_sightings = 
  read_csv ("./data/Rat_Sightings.csv") |>
  janitor::clean_names(case = "snake") |>
  separate(created_date, sep="/", into = c("month", "day", "year")) |> 
  separate(year, sep=" ", into = c("year")) |>
  filter(borough != "STATEN ISLAND") |> 
  filter(year %in% c("2019", "2020", "2021", "2022", "2023")) |>
  mutate(
    borough_id = recode(
      borough, 
      "MANHATTAN" = 1,
      "BRONX" =2,
      "BROOKLYN"=3,
      "QUEENS"= 4)) |>
  mutate(
    month = as.numeric(month),
    year = as.numeric(year)
  ) |>
  select(unique_key, month, day, year, location_type, incident_zip, borough, location, borough_id) |>
  mutate(
    borough = str_to_sentence(borough)
  )

Data Cleaning for Waste Tonnage data

waste_tonnage = read_csv("./data/DSNY_Monthly_Tonnage_Data_20231202.csv") %>%
  clean_names(case = "snake") %>%
  mutate(date_split = strsplit(month, "/")) %>%
  mutate(
    year = as.integer(sapply(date_split, function(x) x[1])),
    month = as.integer(sapply(date_split, function(x) x[2]))
  ) %>%
  filter(year %in% c(2022, 2023)) %>% 
  mutate(total_organics = resorganicstons + schoolorganictons)
## Rows: 23296 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): MONTH, BOROUGH, COMMUNITYDISTRICT
## dbl (8): REFUSETONSCOLLECTED, PAPERTONSCOLLECTED, MGPTONSCOLLECTED, RESORGAN...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waste_tonnage = waste_tonnage %>% 
  group_by(borough, month, year, borough_id) %>% 
  summarise(
    total_organics = sum(total_organics, na.rm = TRUE),
    total_refuse = sum(refusetonscollected, na.rm = TRUE)
    )
## `summarise()` has grouped output by 'borough', 'month', 'year'. You can
## override using the `.groups` argument.

Merging rat sightings and waste tonnage data

rat_waste_merged = left_join(rat_sightings, waste_tonnage, by = c("borough_id", "month", "year", "borough"))

filtering out the N/A’s in a dataset

rat_waste_filtered = subset(rat_waste_merged, !is.na(total_organics) & !is.na(total_refuse))

plot 1: total trash by borough for 2022 and 2023

waste_tonnage_by_borough = rat_waste_filtered |> 
  group_by(borough) |> 
  summarise(total_tonnage = sum(total_organics + total_refuse))

ggplot(waste_tonnage_by_borough, aes(x = reorder(borough, -total_tonnage), y = total_tonnage)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "Borough", y = "Trash Tonnage", title = "Trash Tonnage by Borough, 2022 - 2023") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

bar graph by total refuse and total organics (had to separate b/c they are on a different scale)

waste_by_borough_refuse <- aggregate(total_refuse ~ borough, data=waste_tonnage, sum)

# barplot for total refuse
barplot(waste_by_borough_refuse$total_refuse,
        main="Total Refuse Production by Borough",
        xlab="Borough",
        ylab="Total Refuse (tons)",
        col="blue",
        names.arg=waste_by_borough_refuse$borough,
        las=2)

waste_by_borough_organics <- aggregate(total_organics ~ borough, data=waste_tonnage, sum)

# barplot for total organics
barplot(waste_by_borough_organics$total_organics,
        main="Total Organics Production by Borough",
        xlab="Borough",
        ylab="Total Organics (tons)",
        col="green",
        names.arg=waste_by_borough_organics$borough,
        las=2)

plot 2: trash by borough & month

total_tonnage_by_borough_year_month = rat_waste_filtered |> 
  group_by(borough, year, month) |> 
  summarise(total_tonnage = sum(total_organics + total_refuse))
## `summarise()` has grouped output by 'borough', 'year'. You can override using
## the `.groups` argument.
total_tonnage_by_borough_year_month$year_month <- as.Date(paste(total_tonnage_by_borough_year_month$year, total_tonnage_by_borough_year_month$month, "01", sep = "-"))

ggplot(total_tonnage_by_borough_year_month, aes(x = year_month, y = total_tonnage, group = borough, color = borough)) +
  geom_line() +
  labs(x = "Year-Month", y = "Amount of Trash (in tons)", title = "Trash Tonnage by Month and Borough") +
  scale_x_date(date_labels = "%Y-%m", date_breaks = "1 month") +
  scale_y_continuous(labels = scales::number_format(scale = 1e-6)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

statistical analysis 1: statistical test (ANOVA) to assess the differences in waste tonnage among the boroughs.

anova_result_organics = aov(total_organics ~ borough, data = waste_tonnage) |> broom::tidy()

anova_result_refuse = aov(total_refuse ~ borough, data = waste_tonnage) |> broom::tidy()

The one-way ANOVA test was performed to assess the differences in organic waste tonnage among the boroughs. The ANOVA test revealed statistically significant differences in organic waste tonnage among the boroughs (F(4, 112) = 10.5, p < 0.05).

A one-way ANOVA test was conducted to examine the differences in refuse waste tonnage among the boroughs.The ANOVA test revealed statistically significant differences in refuse waste tonnage among the boroughs (F(4, 112) = 82.1, p < 0.001).

Data Cleaning for Waste Tonnage data

waste_tonnage = read_csv("./data/DSNY_Monthly_Tonnage_Data_20231202.csv") %>%
  clean_names(case = "snake") %>%
  mutate(date_split = strsplit(month, "/")) %>%
  mutate(
    year = as.integer(sapply(date_split, function(x) x[1])),
    month = as.integer(sapply(date_split, function(x) x[2]))
  ) %>%
  filter(year %in% c(2022, 2023)) %>% 
  mutate(total_organics = resorganicstons + schoolorganictons)
## Rows: 23296 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): MONTH, BOROUGH, COMMUNITYDISTRICT
## dbl (8): REFUSETONSCOLLECTED, PAPERTONSCOLLECTED, MGPTONSCOLLECTED, RESORGAN...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waste_tonnage = waste_tonnage %>% 
  group_by(borough, month, year, borough_id) %>% 
  summarise(
    total_organics = sum(total_organics, na.rm = TRUE),
    total_refuse = sum(refusetonscollected, na.rm = TRUE)
    )
## `summarise()` has grouped output by 'borough', 'month', 'year'. You can
## override using the `.groups` argument.

Merging rat sightings and waste tonnage data

rat_waste_merged = left_join(rat_sightings, waste_tonnage, by = c("borough_id", "month", "year", "borough"))

One question we are interest in is how do the rat sightings differ over time and across boroughs

rat_sightings = 
  read_csv ("./data/Rat_Sightings.csv") |>
  janitor::clean_names(case = "snake") |>
  separate(created_date, sep="/", into = c("month", "day", "year")) |> 
  separate(year, sep=" ", into = c("year")) |>
  filter(borough != "STATEN ISLAND") |> 
  filter(year %in% c("2019", "2020", "2021", "2022", "2023")) |>
  mutate(
    borough_id = recode(
      borough, 
      "MANHATTAN" = 1,
      "BRONX" =2,
      "BROOKLYN"=3,
      "QUEENS"= 4)) |>
  mutate(
    month = as.numeric(month),
    year = as.numeric(year)
  ) |>
  select(unique_key, month, day, year, location_type, incident_zip, borough, location, borough_id) |>
  mutate(
    borough = str_to_sentence(borough)
  )
## Rows: 232625 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (25): Created Date, Closed Date, Agency, Agency Name, Complaint Type, De...
## dbl  (6): Unique Key, Incident Zip, X Coordinate (State Plane), Y Coordinate...
## lgl  (7): Vehicle Type, Taxi Company Borough, Taxi Pick Up Location, Bridge ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(rat_sightings)
##    unique_key           month            day                 year     
##  Min.   :41310910   Min.   : 1.000   Length:105186      Min.   :2019  
##  1st Qu.:47382874   1st Qu.: 4.000   Class :character   1st Qu.:2020  
##  Median :52406739   Median : 7.000   Mode  :character   Median :2021  
##  Mean   :51687699   Mean   : 6.543                      Mean   :2021  
##  3rd Qu.:55862392   3rd Qu.: 9.000                      3rd Qu.:2022  
##  Max.   :59604845   Max.   :12.000                      Max.   :2023  
##                                                                       
##  location_type       incident_zip     borough            location        
##  Length:105186      Min.   :10000   Length:105186      Length:105186     
##  Class :character   1st Qu.:10038   Class :character   Class :character  
##  Mode  :character   Median :11205   Mode  :character   Mode  :character  
##                     Mean   :10783                                        
##                     3rd Qu.:11226                                        
##                     Max.   :12345                                        
##                     NA's   :31                                           
##    borough_id  
##  Min.   :1.00  
##  1st Qu.:1.00  
##  Median :3.00  
##  Mean   :2.44  
##  3rd Qu.:3.00  
##  Max.   :4.00  
##  NA's   :21
variable_types <- sapply(rat_sightings, class)
print(variable_types)
##    unique_key         month           day          year location_type 
##     "numeric"     "numeric"   "character"     "numeric"   "character" 
##  incident_zip       borough      location    borough_id 
##     "numeric"   "character"   "character"     "numeric"
# variables are not classified well for analysis so will need to convert numeric variables
numeric_vars_to_convert <- c("unique_key", "month", "year", "incident_zip", "borough_id")

rat_sightings <- rat_sightings %>% 
  mutate(across(all_of(numeric_vars_to_convert), as.factor))
        
variable_types <- sapply(rat_sightings, class)
print(variable_types)
##    unique_key         month           day          year location_type 
##      "factor"      "factor"   "character"      "factor"   "character" 
##  incident_zip       borough      location    borough_id 
##      "factor"   "character"   "character"      "factor"
# number of rat sightings by boro each year
rats_boro = rat_sightings %>% 
  janitor::clean_names() %>% 
  select(borough, year, unique_key) %>% 
  group_by(borough, year) %>% 
  count() %>% 
  summarize(avg_rat_sightings = mean(n)) %>% 
  ungroup %>% 
  spread(key = year, value = avg_rat_sightings) %>% 
  filter(borough != 'Unspecified')# I want to remove the unsepcified
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
knitr::kable(rats_boro)
borough 2019 2020 2021 2022 2023
Bronx 2789 2762 4469 4127 3378
Brooklyn 6414 5995 9500 10122 9597
Manhattan 4318 4180 6947 7455 6232
Queens 2522 2711 3400 4092 4155

We can see from the kable output that there was quite a substantial jump in rat sightings from 2020 to 2021 in all of the boroughs. This may be another COVID phenomena as restaurants shifted to more outdoor dining which deposited more food waste and other things that attract rats onto the streets during the pandemic and after the pandemic when indoor dining became less feasible. https://apnews.com/article/rats-new-york-9dc65afa66a3535cba01b1ea866973a1#:~:text=NEW%20YORK%20(AP)%20%E2%80%94%20They,so%20did%20the%20city’s%20rats.

Are the differences we can see in average rate sighting across time and boroughs statistically significant?

# I will test the statistical difference of average rat sighting across boroughs and across time.


rat_sightings_agg = rat_sightings |> 
  group_by(year, borough, month) |> 
  filter(borough != "Unspecified") %>% 
  summarise(count = n())
## `summarise()` has grouped output by 'year', 'borough'. You can override using
## the `.groups` argument.
anova_result = aov(count ~ factor(year) * factor(borough), data = rat_sightings_agg) |> broom::tidy()

anova_result_no_interaction = aov(count ~ factor(year) + factor(borough), data = rat_sightings_agg) |> broom::tidy()


# Print the summary to get F-statistic and p-value
anova1_summary <- summary(anova_result)
knitr::kable(anova1_summary)
term df sumsq meansq statistic p.value
Length:4 Min. : 3.00 Min. : 548327 Min. : 16574 Min. : 2.757 Min. :0.0000000
Class :character 1st Qu.: 3.75 1st Qu.:1776348 1st Qu.: 38414 1st Qu.: 17.863 1st Qu.:0.0000000
Mode :character Median : 8.00 Median :2882831 Median : 296058 Median : 32.969 Median :0.0000000
NA Mean : 58.75 Mean :3310296 Mean : 729439 Mean : 58.348 Mean :0.0005540
NA 3rd Qu.: 63.00 3rd Qu.:4416779 3rd Qu.: 987083 3rd Qu.: 86.144 3rd Qu.:0.0008311
NA Max. :216.00 Max. :6927195 Max. :2309065 Max. :139.319 Max. :0.0016621
NA NA NA NA NA’s :1 NA’s :1
anova2_summary <- summary(anova_result_no_interaction)
knitr::kable(anova2_summary)
term df sumsq meansq statistic p.value
Length:3 Min. : 3.00 Min. :2185688 Min. : 18107 Min. : 30.18 Min. :0
Class :character 1st Qu.: 3.50 1st Qu.:3156994 1st Qu.: 282264 1st Qu.: 54.52 1st Qu.:0
Mode :character Median : 4.00 Median :4128300 Median : 546422 Median : 78.85 Median :0
NA Mean : 78.33 Mean :4413728 Mean : 957865 Mean : 78.85 Mean :0
NA 3rd Qu.:116.00 3rd Qu.:5527748 3rd Qu.:1427744 3rd Qu.:103.19 3rd Qu.:0
NA Max. :228.00 Max. :6927195 Max. :2309065 Max. :127.53 Max. :0
NA NA NA NA NA’s :1 NA’s :1

From the ANOVA tests above we can see from the p-value of both tests being <0.001 we can reject the null hypotheses and conclude that at least one of the average rat sightings by borough and by year are statistically different.

Distributions

rat_sightings_agg <- rat_sightings_agg %>%
  mutate(common_date = paste0(year, "-", month),
         common_date = lubridate::ym(common_date))

# Plot rat sightings over time
rats_yr_plot <- rat_sightings_agg %>%
  ggplot(aes(x = common_date, y = count, group = year, color = year)) +
  geom_point(alpha = .3) +
  geom_smooth(se = FALSE) +
  facet_wrap(~ borough, scales = "free_y", ncol = 2) +  # Facet wrap by borough
  labs(
    title = "Rat Sightings Over Time",
       x = "Date",
       y = "Rat Sightings Count") +
  theme(text = element_text(size = 15), 
        axis.text.x = element_text(angle = 60, hjust = 1, size = 10)) +
  scale_colour_discrete("Year") +
  scale_x_date(date_breaks = "3 month", labels = function(x) format(x, "%b")) +
  scale_color_viridis_d(end = .8)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
print(rats_yr_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplotly(rats_yr_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
rat_sightings_tsibble <- rat_sightings_agg %>%
  mutate(common_date = paste0(year, "-", month),
         common_date = lubridate::ym(common_date)) %>%
  as_tsibble(index = common_date, key = borough) %>%
  select(borough, common_date, count)  # Include common_date in the select statement
## Adding missing grouping variables: `year`
# Time series plot
rats_time_series_plot <- rat_sightings_tsibble %>%
  ggplot(aes(x = common_date, y = count, color = borough, group = borough)) +
  geom_line() +
  labs(title = "Rat Sightings Time Series",
       x = "Date",
       y = "Rat Sightings Count",
       color = "Borough") +
  theme_minimal()

# Print the plot
print(rats_time_series_plot)

ggplotly(rats_time_series_plot)
borough_data <- rat_sightings_tsibble %>%
  filter(borough == "Brooklyn")  

# Convert data to a univariate time series
ts_data <- ts(borough_data$count, frequency = 12)  
# Time series analysis using STL decomposition
ts_analysis <- stlf(ts_data, h = 12)  

# Plot time series decomposition
autoplot(ts_analysis) +
  labs(title = "Time Series Decomposition",
       y = "Rat Sightings Count",
       color = "Components") +
  theme_minimal()

# Obtain forecasts
forecast_result <- ts_analysis %>%
  forecast(h = 12)  


# Plot forecasts
autoplot(forecast_result) +
  labs(title = "Rat Sightings Forecast",
       x = "Date",
       y = "Rat Sightings Count") +
  theme_minimal()

rat_sightings_tsibble2 <- rat_sightings_agg %>%
  mutate(common_date = paste0(year, "-", month, "-01"),
         common_date = lubridate::ymd(common_date),
         rule_impact = ifelse(common_date >= "2023-04-01", "After", "Before")) %>%
  as_tsibble(index = common_date, key = c(borough, rule_impact)) %>%
  select(borough, common_date, count, rule_impact)
## Adding missing grouping variables: `year`
# Plot to analyze the impact of the rule
rats_rule_impact_plot <- rat_sightings_tsibble2 %>%
  ggplot(aes(x = common_date, y = count, color = rule_impact, group = interaction(borough, rule_impact))) +
  geom_line() +
  labs(title = "Impact of Rat Extermination Rule",
       x = "Date",
       y = "Rat Sightings Count",
       color = "Rule Impact",
       subtitle = "Comparison Before and After April 2013") +
  theme(text = element_text(size = 15), 
        axis.text.x = element_text(angle = 60, hjust = 1, size = 10)) +
  scale_colour_discrete("Year") +
  scale_x_date(date_breaks = "3 month", labels = function(x) format(x, "%b")) +
  scale_color_viridis_d(end = .8)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Print the plot
print(rats_rule_impact_plot)

ggplotly(rats_rule_impact_plot)

Visualizations

viz1_data = rats_boro %>% 
  pivot_longer(cols = starts_with("20"),
               names_to = "Year",
               values_to = "avg_rat_sightings"
  )

  ggplot(viz1_data, aes(x = Year, y = avg_rat_sightings)) +
   geom_point(alpha = 0.3, size = 2) +
   geom_line(size = 1, alpha = 0.6) +
   facet_wrap(~borough, scales = "free_y") +
     theme(legend.position = "bottom",
         axis.text.y = element_text(color = "black", 
                                    size = 10,  hjust = 1), 
         axis.text.x = element_text(angle = 45, 
                                    hjust = 1, size = 10)) +
   labs(
     x = "Year",
     y = "Average Rat Sightings",
     title = "Average Rate Sightings From 2019-2023 by Borough"
   ) + 
     viridis::scale_colour_viridis() 
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# I cannot seem to get the line to form so I will try it with a different data format 
  
 ggplot(rat_sightings_agg, aes(x = year, y = count)) +
   geom_point(alpha = 0.3, size = 2) +
   geom_line(size = 1, alpha = 0.6) +
   facet_wrap(~borough, scales = "free_y") +
     theme(legend.position = "bottom",
         axis.text.y = element_text(color = "black", 
                                    size = 10,  hjust = 1), 
         axis.text.x = element_text(angle = 45, 
                                    hjust = 1, size = 10)) +
   labs(
     x = "Year",
     y = "Average Rat Sightings",
     title = "Average Rate Sightings From 2019-2023 by Borough"
   ) + 
     viridis::scale_colour_viridis() 

 # This graph is also not what I had in mind